library(SummarizedExperiment)
## Loading required package: GenomicRanges
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colnames,
## do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, lengths, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff,
## sort, table, tapply, union, unique, unsplit
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
library(limma)
##
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
library(edgeR)
library(sva)
## Loading required package: mgcv
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:IRanges':
##
## collapse
## This is mgcv 1.8-12. For overview type 'help("mgcv-package")'.
## Loading required package: genefilter
##
## Attaching package: 'genefilter'
## The following object is masked from 'package:base':
##
## anyNA
library(ggplot2)
library(geneplotter)
## Loading required package: lattice
## Loading required package: annotate
## Loading required package: AnnotationDbi
## Loading required package: XML
library(org.Hs.eg.db)
## Loading required package: DBI
##
library(GOstats)
## Loading required package: Category
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:IRanges':
##
## expand
## Loading required package: GO.db
##
## Loading required package: graph
##
## Attaching package: 'graph'
## The following object is masked from 'package:XML':
##
## addNode
##
## Attaching package: 'GOstats'
## The following object is masked from 'package:AnnotationDbi':
##
## makeGOGraph
library(biomaRt)
library(KEGG.db)
##
## KEGG.db contains mappings based on older data because the original
## resource was removed from the the public domain before the most
## recent update was produced. This package should now be
## considered deprecated and future versions of Bioconductor may
## not have it available. Users who want more current data are
## encouraged to look at the KEGGREST or reactome.db packages
#library(e1071)
library(GSEABase)
library(GSVAdata)
## Loading required package: hgu95a.db
##
We load the raw RNA-seq data counts set of Kidney renal clear-cell-carcinoma.
se <- readRDS(file.path("~/GitHub/KIDNEY/seKIRC.rds"))
se
## class: RangedSummarizedExperiment
## dim: 20115 614
## metadata(5): experimentData annotation cancerTypeCode
## cancerTypeDescription objectCreationDate
## assays(1): counts
## rownames(20115): 1 2 ... 102724473 103091865
## rowRanges metadata column names(3): symbol txlen txgc
## colnames(614): TCGA.3Z.A93Z.01A.11R.A37O.07
## TCGA.6D.AA2E.01A.11R.A37O.07 ... TCGA.CZ.5988.11A.01R.1672.07
## TCGA.CZ.5989.11A.01R.1672.07
## colData names(549): type bcr_patient_uuid ...
## lymph_nodes_aortic_pos_by_ihc lymph_nodes_aortic_pos_total
We take a look to the column data, which include the phenotipic information of all samples of the sumarized experiment object.
dim(colData(se))
## [1] 614 549
colData(se)[1:5, 1:5]
## DataFrame with 5 rows and 5 columns
## type bcr_patient_uuid
## <factor> <factor>
## TCGA.3Z.A93Z.01A.11R.A37O.07 tumor 2B1DEA0A-6D55-4FDD-9C1C-0D9FBE03BD78
## TCGA.6D.AA2E.01A.11R.A37O.07 tumor D3B47E53-6F40-4FC8-B5A4-CBE548A770A9
## TCGA.A3.3306.01A.01R.0864.07 tumor 9fb55e0b-43d8-40a3-8ef2-d198e6290551
## TCGA.A3.3307.01A.01R.0864.07 tumor 7ac1d6c6-9ade-49af-8794-10b5b96b2b05
## TCGA.A3.3308.01A.02R.1325.07 tumor 3cbca837-f5a7-4a87-8f02-c59eac232d5a
## bcr_patient_barcode form_completion_date
## <factor> <factor>
## TCGA.3Z.A93Z.01A.11R.A37O.07 TCGA-3Z-A93Z 2014-11-11
## TCGA.6D.AA2E.01A.11R.A37O.07 TCGA-6D-AA2E 2014-3-17
## TCGA.A3.3306.01A.01R.0864.07 TCGA-A3-3306 2010-8-23
## TCGA.A3.3307.01A.01R.0864.07 TCGA-A3-3307 2010-4-13
## TCGA.A3.3308.01A.02R.1325.07 TCGA-A3-3308 2010-4-12
## prospective_collection
## <factor>
## TCGA.3Z.A93Z.01A.11R.A37O.07 YES
## TCGA.6D.AA2E.01A.11R.A37O.07 YES
## TCGA.A3.3306.01A.01R.0864.07 NO
## TCGA.A3.3307.01A.01R.0864.07 NO
## TCGA.A3.3308.01A.02R.1325.07 NO
We also need to observe to the metadata information content of these data.
mcols(colData(se), use.names=TRUE)
## DataFrame with 549 rows and 2 columns
## labelDescription
## <character>
## type sample type (tumor/normal)
## bcr_patient_uuid bcr patient uuid
## bcr_patient_barcode bcr patient barcode
## form_completion_date form completion date
## prospective_collection tissue prospective collection indicator
## ... ...
## lymph_nodes_pelvic_pos_total total pelv lnp
## lymph_nodes_aortic_examined_count total aor lnr
## lymph_nodes_aortic_pos_by_he aln pos light micro
## lymph_nodes_aortic_pos_by_ihc aln pos ihc
## lymph_nodes_aortic_pos_total total aor-lnp
## CDEID
## <character>
## type NA
## bcr_patient_uuid NA
## bcr_patient_barcode 2673794
## form_completion_date NA
## prospective_collection 3088492
## ... ...
## lymph_nodes_pelvic_pos_total 3151828
## lymph_nodes_aortic_examined_count 3104460
## lymph_nodes_aortic_pos_by_he 3151832
## lymph_nodes_aortic_pos_by_ihc 3151831
## lymph_nodes_aortic_pos_total 3151827
As we can see there are three different columns on the metadata result. The first column is the name of all the variables; the second is the labelDescription, which is the definition of each variable; and the last one corresponding to the Common Data Element Identifier (CDEID).
We look through the data rows, which are the feature elements, to see the genes information of our data set.
rowRanges(se)
## GRanges object with 20115 ranges and 3 metadata columns:
## seqnames ranges strand | symbol txlen
## <Rle> <IRanges> <Rle> | <character> <integer>
## 1 chr19 [58345178, 58362751] - | A1BG 3322
## 2 chr12 [ 9067664, 9116229] - | A2M 4844
## 9 chr8 [18170477, 18223689] + | NAT1 2280
## 10 chr8 [18391245, 18401218] + | NAT2 1322
## 12 chr14 [94592058, 94624646] + | SERPINA3 3067
## ... ... ... ... ... ... ...
## 100996331 chr15 [20835372, 21877298] - | POTEB 1706
## 101340251 chr17 [40027542, 40027645] - | SNORD124 104
## 101340252 chr9 [33934296, 33934376] - | SNORD121B 81
## 102724473 chrX [49303669, 49319844] + | GAGE10 538
## 103091865 chr21 [39313935, 39314962] + | BRWD1-IT2 1028
## txgc
## <numeric>
## 1 0.5644190
## 2 0.4882329
## 9 0.3942982
## 10 0.3895613
## 12 0.5249429
## ... ...
## 100996331 0.4308324
## 101340251 0.4903846
## 101340252 0.4074074
## 102724473 0.5055762
## 103091865 0.5924125
## -------
## seqinfo: 455 sequences (1 circular) from hg38 genome
We create an object to hold the dataset to be analysed in a better and more comprehensive way.
dge <- DGEList(counts=assays(se)$counts, genes=mcols(se))
## Warning in as.data.frame.DataFrame(genes, stringsAsFactors = FALSE):
## Arguments in '...' ignored
Moreover, we calculate \(\log_2\) CPM values of expression and we save them in an assay element.
assays(se)$logCPM <- cpm(dge, log=TRUE, prior.count=0.5)
At this point, we need to determine if the library sizes of tumor samples and normal samples are similar or not.
libsize <- data.frame(libsize = dge$sample$lib.size/1e6, type = se$type)
summary(libsize)
## libsize type
## Min. : 3.049 normal: 72
## 1st Qu.: 38.075 tumor :542
## Median : 46.715
## Mean : 46.679
## 3rd Qu.: 54.647
## Max. :115.546
Figure S1: Density distribution of library size.
This past figure shows the density distribution on sequencing depth (milions of reads).
Figure S2: Histogram of library size distribution.
In this other figure, we can see now the proportion of samples in each sequencing depth (Milions of Reads).
As we can see, both figures show that the normal and tumor samples are similar in terms of sequencing depth, except for some samples that show extreme values.
Moreover, we can see how both normal and tumor types, follow a normal distribution through the sequencing depth.
At this point, we can filter out those samples with a low sequencing depth (<45 Milions of reads) because they are less reliable.
dge.filt <- dge[, dge$sample$lib.size/1e6 > 45 ]
final.samples <- rownames(dge.filt$samples)
se.filt <-se[,colnames(se) %in% final.samples]
se.normal <- se.filt[,se.filt$type == "normal"]
se.tumor <- se.filt[,se.filt$type == "tumor"]
normal.code <- substr(colnames(se.normal), 9, 12)
tumor.code <- substr(colnames(se.tumor), 9, 12)
common.codes <- intersect(normal.code, tumor.code)
length(common.codes)
## [1] 38
se.paired <- se.filt[,substr(colnames(se.filt), 9, 12) %in% common.codes]
summary(se.paired$type)
## normal tumor
## 38 38
colData(se.paired)$samplecodes <- substr(colnames(se.paired), 9, 12)
dge.filt <- DGEList(counts=assays(se.paired)$counts, genes=mcols(se.paired))
## Warning in as.data.frame.DataFrame(genes, stringsAsFactors = FALSE):
## Arguments in '...' ignored
Figure S3: Library sizes in increasing order.
par(mfrow=c(1,2), mar=c(4,5,1,1))
In the next part, we take a look on the distribution of expression values per sample in terms of logarithmic CPM units.
Figure S4: Multidensity plot of log2CPM.
We cannot observe significant diferences between samples in terms of expression levels.
As we have more than 200 samples, we now display the normal and tumor distribution separately.
normalCPM <- se.paired[,se.paired$type == "normal"]
tumorCPM <- se.paired[,se.paired$type == "tumor"]
Figure S5: Multidensity plot of normal samples log2CPM.
Figure S6: Multidensity plot of tumor samples log2CPM.
Now, we want to filter out the genes with a low expression. To do that, we can plot the CPMs in logarithmic scale.
genemean <- data.frame(Means=aveLogCPM(assays(se.paired)$counts))
## Warning: `geom_bar()` no longer has a `binwidth` parameter. Please use
## `geom_histogram()` instead.
Figure S7: Distribution of expression levels among genes.
We can see how there are two peaks of genes. The first one correspond to these genes with a very low expression and thus we decide to remove them. In other to do that, we used a minimum average of 1 LogCPM unit as a cutoff to filter out lowly expressed genes.
avgexp <- aveLogCPM(assays(se.paired)$counts)
mask <- avgexp > 1
These are the numbers of genes before filtering:
dim(se.paired) # Before filtering SE
## [1] 20115 76
dim(dge.filt) # Before filtering DGE
## [1] 20115 76
These are the numbers of samples genes after filtering:
se.paired.genes <- se.paired[mask, ]
dim(se.paired.genes) # After filtering SE
## [1] 12495 76
dge.filt <- dge.filt[mask, ]
dim(dge.filt) # After filtering DGE
## [1] 12495 76
At this point, we can calculate the normaliation factors on the filtered expression data set.
dgenorm <- calcNormFactors(dge.filt)
assays(se.paired.genes)$logCPMnorm <- cpm(dgenorm, log=TRUE, prior.count=0.5)
We look at the MA-plots of the normal samples to see if there are systematic biases in gene expression levels in any of the sampes. We expect the slope to be around zero.
Figure S8: MA-plots of normal samples.
After observing them, one normal sample has been identified with a biased gene expression, so we take it out of our dataset. This sample is the TGCA-CW-5591.
dim(se.paired.genes) # Before filtering SE
## [1] 12495 76
dim(dge.filt) # Before filtering DGE
## [1] 12495 76
se.paired.genes <- se.paired.genes[,!se.paired.genes$bcr_patient_barcode %in% "TCGA-CW-5591"]
dim(se.paired.genes) # After filtering SE
## [1] 12495 74
dge.filt <- dge.filt[, !substr(rownames(dge.filt$samples), 1, 12) %in% "TCGA.CW.5591"]
dim(dge.filt) # After filtering DGE
## [1] 12495 74
dgenorm <- dgenorm[, !substr(rownames(dgenorm$samples), 1, 12) %in% "TCGA.CW.5591"]
dim(dgenorm) # After filtering DGE
## [1] 12495 74
Now, we examine the tumor samples:
Figure S9: MA-plots of tumor samples.
As we can see, we don’t see any tumor samples with major biases in gene expression
Now we’re going to analyze our data in order to search for batch effect that could interfere with the biological signal. First, we analyze some of the information contained in the barcode, such as tissue source site, center of sequenciation, plate and portion and analyte combinations. We use two approaches to try to identify batch effect, the Hierarchical Clustering and a Multidimensional Scaling plot.
tss <- substr(colnames(se.paired.genes), 6, 7)
table(tss)
## tss
## B0 B2 B8 CJ CW CZ
## 14 2 4 4 14 36
center <- substr(colnames(se.paired.genes), 27, 28)
table(center)
## center
## 07
## 74
plate <- substr(colnames(se.paired.genes), 22, 25)
table(plate)
## plate
## 1503 1541 1672
## 38 18 18
portionanalyte <- substr(colnames(se.paired.genes), 18, 20)
table(portionanalyte)
## portionanalyte
## 01R 02R 11R
## 58 4 12
samplevial <- substr(colnames(se.paired.genes), 14, 16)
table(samplevial)
## samplevial
## 01A 01B 11A
## 36 1 37
table(data.frame(TYPE=se.paired.genes$type, GENDER=se.paired.genes$gender))
## GENDER
## TYPE FEMALE MALE
## normal 13 24
## tumor 13 24
#logCPM <- cpm(dgenorm, log=TRUE, prior.count=3)
d <- as.dist(1-cor(assays(se.paired.genes)$logCPMnorm, method="spearman"))
sampleClustering <- hclust(d)
batch <- as.integer(factor(se.paired.genes$gender))
sampleDendrogram <- as.dendrogram(sampleClustering, hang=0.1)
names(batch) <- colnames(se.paired.genes)
outcome <- paste(substr(colnames(se.paired.genes), 9, 12), as.character(se.paired.genes$type), sep="-")
names(outcome) <- colnames(se.paired.genes)
sampleDendrogram <- dendrapply(sampleDendrogram,
function(x, batch, labels) {
if (is.leaf(x)) {
attr(x, "nodePar") <- list(lab.col=as.vector(batch[attr(x, "label")]))
attr(x, "label") <- as.vector(labels[attr(x, "label")])
}
x
}, batch, outcome)
plot(sampleDendrogram, main="Hierarchical clustering of samples")
legend("topright", paste("TSS", sort(unique(batch)), levels(factor(se.paired.genes$gender))), fill=sort(unique(batch)))
Figure S10: Hierarchical clustering of samples.
plotMDS(dgenorm, labels=outcome, col=batch)
legend("bottomleft", paste("Gender", sort(unique(batch)), levels(factor(se.paired.genes$gender))),
fill=sort(unique(batch)), inset=0.05)
Figure S11: Multidimensional scaling plot of samples separated by gender.
table(data.frame(TYPE=se.paired.genes$type, TSS=tss))
## TSS
## TYPE B0 B2 B8 CJ CW CZ
## normal 7 1 2 2 7 18
## tumor 7 1 2 2 7 18
#logCPM <- cpm(dgenorm, log=TRUE, prior.count=3)
d <- as.dist(1-cor(assays(se.paired.genes)$logCPMnorm, method="spearman"))
sampleClustering <- hclust(d)
batch <- as.integer(factor(tss))
sampleDendrogram <- as.dendrogram(sampleClustering, hang=0.1)
names(batch) <- colnames(se.paired.genes)
outcome <- paste(substr(colnames(se.paired.genes), 9, 12), as.character(se.paired.genes$type), sep="-")
names(outcome) <- colnames(se.paired.genes)
sampleDendrogram <- dendrapply(sampleDendrogram,
function(x, batch, labels) {
if (is.leaf(x)) {
attr(x, "nodePar") <- list(lab.col=as.vector(batch[attr(x, "label")]))
attr(x, "label") <- as.vector(labels[attr(x, "label")])
}
x
}, batch, outcome)
plot(sampleDendrogram, main="Hierarchical clustering of samples")
legend("topright", paste("TSS", sort(unique(batch)), levels(factor(tss))), fill=sort(unique(batch)))
Figure S12: Hierarchical clustering of samples separated by TSS.
plotMDS(dgenorm, labels=outcome, col=batch)
legend("bottomleft", paste("TSS", sort(unique(batch)), levels(factor(tss))),
fill=sort(unique(batch)), inset=0.05)
Figure S13: Multidimensional scaling plot of samples separated by TSS.
table(data.frame(TYPE=se.paired.genes$type, PLATE=plate))
## PLATE
## TYPE 1503 1541 1672
## normal 19 9 9
## tumor 19 9 9
#logCPM <- cpm(dgenorm, log=TRUE, prior.count=3)
d <- as.dist(1-cor(assays(se.paired.genes)$logCPMnorm, method="spearman"))
sampleClustering <- hclust(d)
batch <- as.integer(factor(plate))
sampleDendrogram <- as.dendrogram(sampleClustering, hang=0.1)
names(batch) <- colnames(se.paired.genes)
outcome <- paste(substr(colnames(se.paired.genes), 9, 12), as.character(se.paired.genes$type), sep="-")
names(outcome) <- colnames(se.paired.genes)
sampleDendrogram <- dendrapply(sampleDendrogram,
function(x, batch, labels) {
if (is.leaf(x)) {
attr(x, "nodePar") <- list(lab.col=as.vector(batch[attr(x, "label")]))
attr(x, "label") <- as.vector(labels[attr(x, "label")])
}
x
}, batch, outcome)
plot(sampleDendrogram, main="Hierarchical clustering of samples")
legend("topright", paste("PLATE", sort(unique(batch)), levels(factor(plate))), fill=sort(unique(batch)))
Figure S14: Hierarchical clustering of the samples separated by PLATE.
plotMDS(dgenorm, labels=outcome, col=batch)
legend("bottomleft", paste("plate", sort(unique(batch)), levels(factor(plate))),
fill=sort(unique(batch)), inset=0.05)
Figure S15: Multidimensional scaling plot of the samples separated by PLATE.
table(data.frame(TYPE=se.paired.genes$type, P.analyte = portionanalyte))
## P.analyte
## TYPE 01R 02R 11R
## normal 35 2 0
## tumor 23 2 12
#logCPM <- cpm(dgenorm, log=TRUE, prior.count=3)
d <- as.dist(1-cor(assays(se.paired.genes)$logCPMnorm, method="spearman"))
sampleClustering <- hclust(d)
batch <- as.integer(factor(portionanalyte))
sampleDendrogram <- as.dendrogram(sampleClustering, hang=0.1)
names(batch) <- colnames(se.paired.genes)
outcome <- paste(substr(colnames(se.paired.genes), 9, 12), as.character(se.paired.genes$type), sep="-")
names(outcome) <- colnames(se.paired.genes)
sampleDendrogram <- dendrapply(sampleDendrogram,
function(x, batch, labels) {
if (is.leaf(x)) {
attr(x, "nodePar") <- list(lab.col=as.vector(batch[attr(x, "label")]))
attr(x, "label") <- as.vector(labels[attr(x, "label")])
}
x
}, batch, outcome)
plot(sampleDendrogram, main="Hierarchical clustering of samples")
legend("topright", paste("Portionanalyte", sort(unique(batch)), levels(factor(portionanalyte))), fill=sort(unique(batch)))
Figure S16: Hierarchical clustering of the samples separated by PortionAnalyte.
plotMDS(dgenorm, labels=outcome, col=batch)
legend("bottomleft", paste("Portionanalyte", sort(unique(batch)), levels(factor(portionanalyte))),
fill=sort(unique(batch)), inset=0.05)
Figure S17: Multidimensional scaling plot of samples separated by PortionAnalyte.
The Hierarchical clustering and the Multidimensional Scaling plots show a clear differentiation between normal samples and tumor samples. All four plots from the different elements of the barcode have a similar clustering and distribution. There is no clear effect from the batch indicators in the clustering, and it seems to have a balanced design (more clearly in tumor than in normal), although it seems that 9 tumor samples cluster apart from the other samples in the hierarchical clustering, so we should consider taking apart this samples.
In order to identify differentially expressed genes, we used a linear model with two factors: one being the cell type (tumor or normal) and the other being the individual/sample.
We chose a False Discovery Rate of 0.01 (1%). We corrected#### Over represented KEGGs
design <- model.matrix(~type + samplecodes, data = colData(se.paired.genes))
v <- voom(dgenorm, design, plot=FALSE)
dim(se.paired.genes)
## [1] 12495 74
dim(dgenorm)
## [1] 12495 74
FDRcutoff <- 0.01
mod0 <- model.matrix(~samplecodes, colData(se.paired.genes))
sv <- sva(v$E, mod = design, mod0 = mod0)
## Number of significant surrogate variables is: 7
## Iteration (out of 5 ):1 2 3 4 5
design.voom <- ""
design.voom <- cbind(design, sv$sv)
fit <- lmFit(v, design.voom)
fit <- eBayes(fit)
res <- decideTests(fit, p.value = FDRcutoff)
Figure S18: Volcano plot of DE analysis.
colnames(design)
toptable <- topTable(fit, coef = 2, n=Inf)
Figure S19: Pvalue distribution and qqplot.
summary(toptable)
## symbol txlen txgc logFC
## Length:12495 Min. : 43 Min. :0.2133 Min. :-12.46608
## Class :character 1st Qu.: 1738 1st Qu.:0.4223 1st Qu.: -0.40791
## Mode :character Median : 2970 Median :0.4858 Median : 0.07708
## Mean : 3592 Mean :0.4954 Mean : 0.02842
## 3rd Qu.: 4738 3rd Qu.:0.5654 3rd Qu.: 0.54939
## Max. :91667 Max. :0.8636 Max. : 10.11549
## AveExpr t P.Value
## Min. :-5.464 Min. :-34.1775 Min. :0.0000000
## 1st Qu.: 3.199 1st Qu.: -4.6644 1st Qu.:0.0000000
## Median : 4.901 Median : 0.9748 Median :0.0000023
## Mean : 4.624 Mean : 0.9532 Mean :0.0777713
## 3rd Qu.: 6.139 3rd Qu.: 6.6006 3rd Qu.:0.0134982
## Max. :11.941 Max. : 45.0974 Max. :0.9982831
## adj.P.Val B
## Min. :0.0000000 Min. :-8.491
## 1st Qu.:0.0000000 1st Qu.:-5.036
## Median :0.0000045 Median : 3.431
## Mean :0.0832298 Mean : 6.350
## 3rd Qu.:0.0179971 3rd Qu.:15.017
## Max. :0.9982831 Max. :60.630
Figure S20: Plot of logFC distribution.
under_exp <- toptable[toptable$logFC <= -5 ,]
over_exp <- toptable[toptable$logFC >= 5,]
under_exp.all <- toptable[toptable$logFC <= 0 ,]
over_exp.all <- toptable[toptable$logFC >= 0,]
test_overexp <- toptable[toptable$logFC >= 3,]
dim(under_exp)
## [1] 153 9
dim(over_exp)
## [1] 46 9
head(over_exp[order(over_exp$logFC, decreasing = TRUE),])
## symbol txlen txgc logFC AveExpr t P.Value
## 2184 FAH 3427 0.5170703 10.115493 0.8164831 16.72456 1.562600e-17
## 81617 CAB39L 3692 0.4320152 9.104303 3.4604998 22.65289 1.746516e-21
## 389320 TEX43 464 0.4030172 8.527648 -1.9867210 17.35212 5.303748e-18
## 973 CD79A 1258 0.6073132 8.336017 0.9896687 26.65775 1.142480e-23
## 5965 RECQL 3702 0.3725014 7.984984 2.0659290 17.20022 6.869086e-18
## 401387 LRRD1 2830 0.3395760 7.622375 -0.7109079 18.74206 5.402535e-19
## adj.P.Val B
## 2184 2.532386e-16 29.46327
## 81617 1.536811e-19 38.72011
## 389320 1.022690e-16 29.64651
## 973 2.833658e-21 42.55773
## 5965 1.275323e-16 30.74923
## 401387 1.480366e-17 32.07925
head(under_exp[order(under_exp$logFC, decreasing = FALSE),])
## symbol txlen txgc logFC AveExpr t P.Value
## 400713 ZNF880 2230 0.3995516 -12.46608 6.106883 -22.48152 2.203720e-21
## 361 AQP4 5274 0.3795980 -12.09207 5.307332 -22.84434 1.349424e-21
## 125875 CLDND2 974 0.6190965 -11.38629 1.464706 -24.75417 1.138355e-22
## 2592 GALT 2566 0.5420889 -10.99625 2.085865 -24.36734 1.852009e-22
## 8649 LAMTOR3 4226 0.3774255 -10.96988 3.871072 -22.36824 2.572036e-21
## 391196 OR2M7 939 0.4675186 -10.65841 1.105295 -27.39939 4.857318e-24
## adj.P.Val B
## 400713 1.835699e-19 38.66244
## 361 1.286424e-19 38.94593
## 125875 1.725189e-20 39.02897
## 2592 2.461793e-20 39.31175
## 8649 2.034024e-19 38.14609
## 391196 1.619172e-21 41.34597
toptable[toptable$symbol == "PBRM1",]
## symbol txlen txgc logFC AveExpr t P.Value
## 55193 PBRM1 7523 0.4035624 4.146581 2.805781 18.2486 1.195662e-18
## adj.P.Val B
## 55193 2.873037e-17 32.50217
toptable[toptable$symbol == "SETD2",]
## symbol txlen txgc logFC AveExpr t P.Value
## 29072 SETD2 8172 0.4274351 3.342995 1.995913 19.56703 1.487443e-19
## adj.P.Val B
## 29072 5.28e-18 34.61302
toptable[toptable$symbol == "AKT",]
## [1] symbol txlen txgc logFC AveExpr t P.Value
## [8] adj.P.Val B
## <0 rows> (or 0-length row.names)
toptable[toptable$symbol == "VHL1",]
## [1] symbol txlen txgc logFC AveExpr t P.Value
## [8] adj.P.Val B
## <0 rows> (or 0-length row.names)
# PBRM1 i SETD2 is over y es BIEN.
data(c2BroadSets)
# Filtering to only KEGG, Reactome and BioCarta
gsc <- c2BroadSets[c(grep("^KEGG", names(c2BroadSets)),
grep("^REACTOME", names(c2BroadSets)), grep("^BIOCARTA", names(c2BroadSets)))]
gsc <- mapIdentifiers(gsc, AnnoOrEntrezIdentifier(metadata(se.paired.genes)$annotation))
gsmatrix <- incidence(gsc)
dim(gsmatrix)
## [1] 833 6744
# Discard genes not in our data and genes not in the gene sets
gsmatrix <- gsmatrix[, colnames(gsmatrix) %in% rownames(se.paired.genes)]
dim(gsmatrix)
## [1] 833 4038
se.paired.gsea <- se.paired.genes[colnames(gsmatrix), ]
dim(se.paired.genes)
## [1] 12495 74
dim(se.paired.gsea)
## [1] 4038 74
dgenorm.gsea <- dgenorm[colnames(gsmatrix), ]
dim(dgenorm)
## [1] 12495 74
dim(dgenorm.gsea)
## [1] 4038 74
## Filtering gene sets with a minimum size of 5
gsmatrix <- gsmatrix[rowSums(gsmatrix) >= 5, ]
dim(gsmatrix)
## [1] 807 4038
design <- model.matrix(~type + samplecodes, data = colData(se.paired.gsea))
v <- voom(dgenorm, design, plot=FALSE)
FDRcutoff <- 0.01
mod0 <- model.matrix(~samplecodes, colData(se.paired.gsea))
sv <- sva(v$E, mod = design, mod0 = mod0)
## Number of significant surrogate variables is: 7
## Iteration (out of 5 ):1 2 3 4 5
design.voom <- ""
design.voom <- cbind(design, sv$sv)
fit <- lmFit(v, design.voom)
fit <- eBayes(fit)
toptable.gsea <- topTable(fit, coef = 2, n=Inf)
tGSgenes <- toptable.gsea[match(colnames(gsmatrix), rownames(toptable.gsea)), "t"]
length(tGSgenes)
## [1] 4038
head(tGSgenes)
## [1] -4.768989 -4.177214 1.941461 12.904205 -11.136535 2.767150
zS <- sqrt(rowSums(gsmatrix)) * (as.vector(gsmatrix %*% tGSgenes)/rowSums(gsmatrix))
length(zS)
## [1] 807
head(zS)
## KEGG_GLYCOLYSIS_GLUCONEOGENESIS
## -15.558933
## KEGG_CITRATE_CYCLE_TCA_CYCLE
## -1.217683
## KEGG_PENTOSE_PHOSPHATE_PATHWAY
## -1.592216
## KEGG_PENTOSE_AND_GLUCURONATE_INTERCONVERSIONS
## -4.944474
## KEGG_FRUCTOSE_AND_MANNOSE_METABOLISM
## -13.367401
## KEGG_GALACTOSE_METABOLISM
## -14.827441
Figure S21: GSEA qqplot of normal samples.
rnkGS <- sort(abs(zS), decreasing = TRUE)
head(rnkGS,20)
## KEGG_OXIDATIVE_PHOSPHORYLATION
## 51.25554
## KEGG_PRIMARY_IMMUNODEFICIENCY
## 41.91329
## REACTOME_DOWNSTREAM_EVENTS_IN_GPCR_SIGNALING
## 40.84898
## REACTOME_SLC_MEDIATED_TRANSMEMBRANE_TRANSPORT
## 39.72385
## KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION
## 35.65341
## KEGG_JAK_STAT_SIGNALING_PATHWAY
## 34.85214
## KEGG_FATTY_ACID_METABOLISM
## 33.10837
## KEGG_PATHWAYS_IN_CANCER
## 32.33795
## KEGG_VASCULAR_SMOOTH_MUSCLE_CONTRACTION
## 32.31561
## REACTOME_TRANSMEMBRANE_TRANSPORT_OF_SMALL_MOLECULES
## 32.18810
## REACTOME_GLUCOSE_AND_OTHER_SUGAR_SLC_TRANSPORTERS
## 31.77223
## BIOCARTA_CELLCYCLE_PATHWAY
## 31.56679
## REACTOME_CHEMOKINE_RECEPTORS_BIND_CHEMOKINES
## 31.51400
## REACTOME_G_ALPHA_Q_SIGNALLING_EVENTS
## 31.31659
## BIOCARTA_TCAPOPTOSIS_PATHWAY
## 31.13932
## KEGG_SMALL_CELL_LUNG_CANCER
## 30.89871
## KEGG_HEMATOPOIETIC_CELL_LINEAGE
## 30.09452
## BIOCARTA_THELPER_PATHWAY
## 29.81463
## REACTOME_PD1_SIGNALING
## 29.49595
## BIOCARTA_SALMONELLA_PATHWAY
## 29.44496
## plot stuff
plotGS <- function(se, gs, pheno, ...) {
l <- levels(colData(se)[, pheno])
idxSamples1 <- colData(se)[, pheno] == l[1]
idxSamples2 <- colData(se)[, pheno] == l[2]
exps1 <- rowMeans(assays(se)$logCPM[gs, idxSamples1])
exps2 <- rowMeans(assays(se)$logCPM[gs, idxSamples2])
rng <- range(c(exps1, exps2))
plot(exps1, exps2, pch = 21, col = "black", bg = "black", xlim = rng, ylim = rng,
xlab = l[1], ylab = l[2], ...)
abline(a = 0, b = 1, lwd = 2, col = "red")
}
Figure S22: Scatter plots of gene sets sorted by z-score.
DEgenes <- rownames(toptable)[toptable$adj.P.Val < 0.01]
length(DEgenes)
## [1] 9108
geneUniverse <- rownames(se.paired.genes)
params <- new("GOHyperGParams", geneIds=DEgenes, universeGeneIds=geneUniverse,
annotation="org.Hs.eg.db", ontology="BP",
pvalueCutoff=0.01, testDirection="over")
conditional(params) <- TRUE
hgOver <- hyperGTest(params)
GOresults <- summary(hgOver)
GOresults <- GOresults[GOresults$Size >= 5 & GOresults$Count >= 5,]
GOresults <- GOresults[order(GOresults$OddsRatio, decreasing = TRUE), ]
GOresults <- GOresults[order(GOresults$Pvalue, decreasing = FALSE), ]
htmlReport(hgOver, file = "gotests.html")
DEgenes <- rownames(over_exp.all)[over_exp.all$adj.P.Val < 0.01]
length(DEgenes)
## [1] 5094
geneUniverse <- rownames(se.paired.genes)
params <- new("GOHyperGParams", geneIds=DEgenes, universeGeneIds=geneUniverse,
annotation="org.Hs.eg.db", ontology="BP",
pvalueCutoff=0.01, testDirection="over")
conditional(params) <- TRUE
hgOver <- hyperGTest(params)
GOresultsover <- summary(hgOver)
GOresultsover <- GOresultsover[GOresultsover$Size >= 5 & GOresultsover$Count >= 5,]
GOresultsover <- GOresultsover[order(GOresultsover$OddsRatio, decreasing = TRUE), ]
GOresultsover <- GOresultsover[order(GOresultsover$Pvalue, decreasing = FALSE), ]
head(GOresultsover)
## GOBPID Pvalue OddsRatio ExpCount Count Size
## 1 GO:0051301 1.768797e-05 1.533603 165.24901 206 402
## 2 GO:0051251 3.038639e-05 2.063637 53.84980 77 131
## 3 GO:0007156 3.677431e-05 2.469144 34.52964 53 84
## 4 GO:0008283 8.589732e-05 1.275001 459.16206 518 1117
## 5 GO:0032946 1.057712e-04 2.633582 26.71937 42 65
## 6 GO:0050867 1.128832e-04 1.883097 59.60474 82 145
## Term
## 1 cell division
## 2 positive regulation of lymphocyte activation
## 3 homophilic cell adhesion via plasma membrane adhesion molecules
## 4 cell proliferation
## 5 positive regulation of mononuclear cell proliferation
## 6 positive regulation of cell activation
htmlReport(hgOver, file = "gotestsover.html")
DEgenes <- rownames(under_exp.all)[under_exp.all$adj.P.Val < 0.01]
length(DEgenes)
## [1] 4014
geneUniverse <- rownames(se.paired.genes)
params <- new("GOHyperGParams", geneIds=DEgenes, universeGeneIds=geneUniverse,
annotation="org.Hs.eg.db", ontology="BP",
pvalueCutoff=0.01, testDirection="over")
conditional(params) <- TRUE
hgUnder <- hyperGTest(params)
GOresultsunder <- summary(hgUnder)
GOresultsunder <- GOresultsunder[GOresultsunder$Size >= 5 & GOresultsunder$Count >= 5,]
GOresultsunder <- GOresultsunder[order(GOresultsunder$OddsRatio, decreasing = TRUE), ]
GOresultsunder <- GOresultsunder[order(GOresultsunder$Pvalue, decreasing = FALSE), ]
head(GOresultsunder)
## GOBPID Pvalue OddsRatio ExpCount Count Size
## 1 GO:0007210 9.939131e-06 25.749672 4.143125 12 13
## 2 GO:0009145 9.815811e-05 4.534902 8.923653 19 28
## 3 GO:0009201 9.815811e-05 4.534902 8.923653 19 28
## 4 GO:0022904 1.096097e-04 2.731924 18.803412 33 59
## 5 GO:0006754 2.123284e-04 4.294813 8.604951 18 27
## 6 GO:0015991 2.123284e-04 4.294813 8.604951 18 27
## Term
## 1 serotonin receptor signaling pathway
## 2 purine nucleoside triphosphate biosynthetic process
## 3 ribonucleoside triphosphate biosynthetic process
## 4 respiratory electron transport chain
## 5 ATP biosynthetic process
## 6 ATP hydrolysis coupled proton transport
htmlReport(hgUnder, file = "gotestsunder.html")
DEgenes <- rownames(toptable)[toptable$adj.P.Val < 0.01]
length(DEgenes)
## [1] 9108
geneUniverse <- rownames(se.paired.genes)
params <- new("KEGGHyperGParams", geneIds=DEgenes, universeGeneIds=geneUniverse,
annotation="org.Hs.eg.db", pvalueCutoff=0.01, testDirection="over")
hgOver <- hyperGTest(params)
hgOver
## Gene to KEGG test for over-representation
## 228 KEGG ids tested (6 have p < 0.01)
## Selected gene set size: 2577
## Gene universe size: 3494
## Annotation package: org.Hs.eg
KEGGresults <- summary(hgOver)
# Filtering by counts and size
KEGGresults <- KEGGresults[KEGGresults$Size >= 5 & KEGGresults$Count >= 5, ]
KEGGresults <- KEGGresults[order(KEGGresults$OddsRatio, decreasing = TRUE), ]
KEGGresults <- KEGGresults[order(KEGGresults$Pvalue, decreasing = FALSE), ]
head(KEGGresults)
## KEGGID Pvalue OddsRatio ExpCount Count Size
## 1 05340 0.001637912 Inf 15.48855 21 21
## 2 04660 0.002350134 3.126865 42.77790 52 58
## 3 05160 0.005940497 2.167665 61.95421 72 84
## 4 00650 0.006277214 8.249021 17.70120 23 24
## 5 00410 0.007574224 Inf 11.80080 16 16
## 6 04144 0.008741059 1.776341 91.45621 103 124
## Term
## 1 Primary immunodeficiency
## 2 T cell receptor signaling pathway
## 3 Hepatitis C
## 4 Butanoate metabolism
## 5 beta-Alanine metabolism
## 6 Endocytosis
When performing the functional enrichment analysis, some fo the gene sets have an OddsRatio value of Infinite. Which is the biological meaning of this?
Then, we performed a Hierarchical clustering and we plotted a Multidimensional scale plot using only the 77 D.E. genes.
head(colnames(colData(se.paired.genes)))
## [1] "type" "bcr_patient_uuid"
## [3] "bcr_patient_barcode" "form_completion_date"
## [5] "prospective_collection" "retrospective_collection"
# Get a mask with DE genes to filter SE object
DE.genes <- rownames(se.paired.genes) %in% rownames(over_exp) |
rownames(se.paired.genes) %in% rownames(under_exp)
# Filter SE object using DE.genes (only paired samples)
se.DE <- se.paired.genes[DE.genes,]
dgenorm.DE <- dgenorm[DE.genes,]
# Filter SE object using DE.genes (all samples)
se.all.DE <- se[DE.genes,]
dgenorm.all.DE <- dge[DE.genes,]
#logCPM <- cpm(dgenorm, log=TRUE, prior.count=3)
d <- as.dist(1-cor(assays(se.DE)$logCPMnorm, method="spearman"))
sampleClustering <- hclust(d)
batch <- as.integer(factor(se.DE$type))
sampleDendrogram <- as.dendrogram(sampleClustering, hang=0.1)
names(batch) <- colnames(se.DE)
outcome <- as.character(se.DE$type)
names(outcome) <- colnames(se.DE)
sampleDendrogram <- dendrapply(sampleDendrogram,
function(x, batch, labels) {
if (is.leaf(x)) {
attr(x, "nodePar") <- list(lab.col=as.vector(batch[attr(x, "label")]))
attr(x, "label") <- as.vector(labels[attr(x, "label")])
}
x
}, batch, outcome)
plot(sampleDendrogram, main="Hierarchical clustering of samples")
legend("topright", paste("TYPE", sort(unique(batch)), levels(factor(se.DE$type))), fill=sort(unique(batch)))
Figure S23: Hierarchical clustering of the samples used in the analysis using only the DE genes.
#logCPM <- cpm(dgenorm, log=TRUE, prior.count=3)
d <- as.dist(1-cor(assays(se.all.DE)$logCPM, method="spearman"))
sampleClustering <- hclust(d)
batch <- as.integer(factor(se.all.DE$type))
sampleDendrogram <- as.dendrogram(sampleClustering, hang=0.1)
names(batch) <- colnames(se.all.DE)
outcome <- as.character(se.all.DE$type)
names(outcome) <- colnames(se.all.DE)
sampleDendrogram <- dendrapply(sampleDendrogram,
function(x, batch, labels) {
if (is.leaf(x)) {
attr(x, "nodePar") <- list(lab.col=as.vector(batch[attr(x, "label")]))
attr(x, "label") <- as.vector(labels[attr(x, "label")])
}
x
}, batch, outcome)
plot(sampleDendrogram, main="Hierarchical clustering of samples")
legend("topright", paste("TYPE", sort(unique(batch)), levels(factor(se.all.DE$type))), fill=sort(unique(batch)))
Figure S24: Hierarchical clustering of all samples using only the DE genes.
sessionInfo()
## R version 3.3.0 (2016-05-03)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 14.04.4 LTS
##
## locale:
## [1] LC_CTYPE=es_ES.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=es_ES.UTF-8 LC_COLLATE=es_ES.UTF-8
## [5] LC_MONETARY=es_ES.UTF-8 LC_MESSAGES=es_ES.UTF-8
## [7] LC_PAPER=es_ES.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] xtable_1.8-2 GSVAdata_1.6.0
## [3] hgu95a.db_3.2.2 GSEABase_1.32.0
## [5] KEGG.db_3.2.2 biomaRt_2.26.1
## [7] GOstats_2.36.0 graph_1.48.0
## [9] Category_2.36.0 GO.db_3.2.2
## [11] Matrix_1.2-6 org.Hs.eg.db_3.2.3
## [13] RSQLite_1.0.0 DBI_0.4
## [15] geneplotter_1.48.0 annotate_1.48.0
## [17] XML_3.98-1.1 AnnotationDbi_1.32.3
## [19] lattice_0.20-33 ggplot2_2.1.0
## [21] sva_3.18.0 genefilter_1.52.1
## [23] mgcv_1.8-12 nlme_3.1-128
## [25] edgeR_3.12.1 limma_3.26.9
## [27] SummarizedExperiment_1.0.2 Biobase_2.30.0
## [29] GenomicRanges_1.22.4 GenomeInfoDb_1.6.3
## [31] IRanges_2.4.8 S4Vectors_0.8.11
## [33] BiocGenerics_0.16.1
##
## loaded via a namespace (and not attached):
## [1] splines_3.3.0 colorspace_1.2-6 htmltools_0.3.5
## [4] survival_2.39-4 RBGL_1.46.0 RColorBrewer_1.1-2
## [7] plyr_1.8.3 stringr_1.0.0 zlibbioc_1.16.0
## [10] munsell_0.4.3 gtable_0.2.0 evaluate_0.9
## [13] labeling_0.3 knitr_1.12.3 Rcpp_0.12.4
## [16] KernSmooth_2.23-15 scales_0.4.0 formatR_1.3
## [19] XVector_0.10.0 digest_0.6.9 stringi_1.0-1
## [22] grid_3.3.0 tools_3.3.0 bitops_1.0-6
## [25] magrittr_1.5 RCurl_1.95-4.8 rmarkdown_0.9.6
## [28] AnnotationForge_1.12.2